{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ec1b089a-015e-478b-b54a-b45ec8e93d38",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cd743787-8e5f-40d6-91c2-63bc5dab8658",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from spacy.cli.train import train\n",
    "from spacy.cli.evaluate import evaluate\n",
    "from spacy.tokens import DocBin\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "340f2878-3ed3-4145-b5b1-2e8ef29dda4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Music NER\n",
    "# https://github.com/deezer/music-ner-eacl2023"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b1616ada-127d-42e7-811b-e4777fbc6bac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        id                                               text  start_offset  \\\n",
      "0    13434  i love radioheads kid a something similar | ki...             7   \n",
      "1    13434  i love radioheads kid a something similar | ki...            61   \n",
      "2    13435                anything similar to i fight dragons            20   \n",
      "3    13436                music similar to ccrs travelin band            17   \n",
      "4    13437                 songs similar to blackout by boris            17   \n",
      "..     ...                                                ...           ...   \n",
      "422  14028  songs like good news by mac miller | preferrab...            11   \n",
      "423  14028  songs like good news by mac miller | preferrab...            24   \n",
      "424  14030  something along the lines of either the chain ...            49   \n",
      "425  14030  something along the lines of either the chain ...            29   \n",
      "426  14032       heavy bass x gothic rap like oxygen by bones            29   \n",
      "\n",
      "     end_offset                  label  \n",
      "0            17           Artist_known  \n",
      "1            71  Artist_or_WoA_deduced  \n",
      "2            35            WoA_deduced  \n",
      "3            30         Artist_deduced  \n",
      "4            25            WoA_deduced  \n",
      "..          ...                    ...  \n",
      "422          20            WoA_deduced  \n",
      "423          34         Artist_deduced  \n",
      "424          60  Artist_or_WoA_deduced  \n",
      "425          45  Artist_or_WoA_deduced  \n",
      "426          44  Artist_or_WoA_deduced  \n",
      "\n",
      "[427 rows x 5 columns]\n"
     ]
    }
   ],
   "source": [
    "music_ner_df = pd.read_csv('../data/music_ner.csv')\n",
    "print(music_ner_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cf41447f-5a3a-4fb0-bd9f-b70b096e35c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        id                                               text  start_offset  \\\n",
      "0    13434  i love radioheads kid a something similar | ki...             7   \n",
      "1    13434  i love radioheads kid a something similar | ki...            61   \n",
      "2    13435                anything similar to i fight dragons            20   \n",
      "3    13436                music similar to ccrs travelin band            17   \n",
      "4    13437                 songs similar to blackout by boris            17   \n",
      "..     ...                                                ...           ...   \n",
      "422  14028  songs like good news by mac miller | preferrab...            11   \n",
      "423  14028  songs like good news by mac miller | preferrab...            24   \n",
      "424  14030  something along the lines of either the chain ...            49   \n",
      "425  14030  something along the lines of either the chain ...            29   \n",
      "426  14032       heavy bass x gothic rap like oxygen by bones            29   \n",
      "\n",
      "     end_offset          label  \n",
      "0            17   Artist_known  \n",
      "1            71  Artist_or_WoA  \n",
      "2            35            WoA  \n",
      "3            30         Artist  \n",
      "4            25            WoA  \n",
      "..          ...            ...  \n",
      "422          20            WoA  \n",
      "423          34         Artist  \n",
      "424          60  Artist_or_WoA  \n",
      "425          45  Artist_or_WoA  \n",
      "426          44  Artist_or_WoA  \n",
      "\n",
      "[427 rows x 5 columns]\n"
     ]
    }
   ],
   "source": [
    "# Change labels to Artist, Artist_or_WoA or WoA\n",
    "def change_label(input_label):\n",
    "    input_label = input_label.replace(\"_deduced\", \"\")\n",
    "    return input_label\n",
    "\n",
    "music_ner_df[\"label\"] = music_ner_df[\"label\"].apply(change_label)\n",
    "print(music_ner_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02b63652-f328-4a89-be1f-b7e2af1b036c",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_db = DocBin()\n",
    "test_db = DocBin()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "c59b1ad1-211f-496e-a393-c3ba07bb4d2f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "227\n",
      "170\n",
      "57\n"
     ]
    }
   ],
   "source": [
    "# Get a unique list of unique ids\n",
    "ids = list(set(music_ner_df[\"id\"].values))\n",
    "print(len(ids))\n",
    "# Split ids into training and test\n",
    "train_ids, test_ids = train_test_split(ids)\n",
    "print(len(train_ids))\n",
    "print(len(test_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "0c78c9e9-5351-4c7c-b2cc-8a8d4b501e2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nghtmre street 0 14 Artist_or_WoA\n",
      "anar 17 21 Artist_or_WoA\n",
      "nujabes atlas 25 38 Artist_or_WoA\n",
      "gsh 21 24 Artist_or_WoA\n",
      "gaye 25 29 Artist_or_WoA\n",
      "save yourself stabbing westward 28 59 Artist_or_WoA\n",
      "zimmers 54 61 Artist\n",
      "blade runner 2049 62 79 WoA\n",
      "the llamas with hats 21 41 Artist\n",
      "outro 42 47 WoA\n",
      "bon iver 17 25 Artist\n",
      "iron & wine 30 41 Artist\n",
      "tally hall 23 33 Artist\n",
      "miracle musical 37 52 Artist\n",
      "system of a down 21 37 Artist\n",
      "amon tobin & kid koala 30 52 Artist\n",
      "untitled 53 61 WoA\n",
      "code orange 0 11 Artist\n",
      "dreams in inertia 12 29 WoA\n",
      "code orange 37 48 Artist\n",
      "the sounds of eden 29 47 Artist_or_WoA\n",
      "blackbear and gnash 48 67 Artist_or_WoA\n",
      "the muffs tilt 57 71 Artist\n",
      "be your own pet 72 87 Artist\n",
      "the soviettes tweens 88 108 Artist\n",
      "dog party 109 118 Artist\n",
      "ach so gern 11 22 WoA\n",
      "kid rocks 29 38 Artist\n",
      "greatest show on earth 39 61 WoA\n",
      "airport bar 16 27 WoA\n",
      "noah 31 35 Artist\n",
      "sweet 17 22 Artist\n",
      "smino sudan 13 24 Artist\n",
      "archives 25 33 Artist\n",
      "fjk 34 37 Artist\n",
      "jessie reyez 38 50 Artist\n",
      "tash sultana 54 66 Artist\n",
      "an awesome wave 33 48 WoA\n",
      "alt j 52 57 Artist\n",
      "i love you like a alcoholic 17 44 WoA\n",
      "tax payers 45 55 Artist_or_WoA\n",
      "this song is not about a girl 15 44 WoA\n",
      "flume & chet faker 48 66 Artist\n",
      "little known game 55 72 Artist_or_WoA\n",
      "guardians of the galaxy 27 50 WoA\n",
      "the black keys 11 25 Artist\n",
      "i got mine 26 36 WoA\n",
      "solitude standing 29 46 WoA\n",
      "suzanne vega 50 62 Artist\n",
      "metro booming 7 20 Artist\n",
      "no complaints 21 34 WoA\n",
      "drake 39 44 Artist\n",
      "sneakin 45 52 WoA\n",
      "atlantis 23 31 WoA\n",
      "bridgit mendler 35 50 Artist\n",
      "a star is born 29 43 WoA\n",
      "bones 73 78 Artist_or_WoA\n",
      "deadindesignerclothes 79 100 Artist_or_WoA\n",
      "internetboi 103 114 Artist_or_WoA\n",
      "cindy 17 22 WoA\n",
      "tammany hall nyc 26 42 Artist\n",
      "the likes 33 42 WoA\n",
      "rage against the machine 46 70 Artist\n",
      "bring me the horizon 71 91 WoA\n",
      "dz deathrays 96 108 WoA\n",
      "the honeymoon era 0 17 WoA\n",
      "paiige 21 27 Artist\n",
      "seeya 37 42 WoA\n",
      "deadmau5 and colleen dagostino 46 76 Artist\n",
      "the chain 19 28 WoA\n",
      "fleetwood mac 32 45 Artist\n",
      "koi child 43 52 Artist\n",
      "sturgill simpson 38 54 Artist_or_WoA\n",
      "post malone 37 48 Artist\n",
      "leave 49 54 WoA\n",
      "radioheads 7 17 Artist\n",
      "aphex twin 61 71 Artist_or_WoA\n",
      "i fight dragons 20 35 WoA\n",
      "ccrs travelin 17 30 Artist\n",
      "blackout 17 25 WoA\n",
      "boris 29 34 Artist\n",
      "zoosters breakout 11 28 WoA\n",
      "hans zimmer 32 43 Artist\n",
      "trios 17 22 Artist\n",
      "da da da 23 31 WoA\n",
      "radioheads 19 29 Artist\n",
      "everything in its right place 30 59 WoA\n",
      "black sheep 38 49 WoA\n",
      "scott pilgrim vs the world 55 81 Artist\n",
      "nier automatas 17 31 WoA\n",
      "osts 0 4 Artist\n",
      "princess and the frog 28 49 Artist\n",
      "hadestown 54 63 Artist\n",
      "black parade 63 75 WoA\n",
      "my chemical romance 79 98 Artist\n",
      "crushed up 29 39 WoA\n",
      "future 43 49 Artist\n",
      "arctic monkeys 6 20 Artist\n",
      "green day 21 30 Artist\n",
      "nirvana 31 38 Artist\n",
      "matt champion 32 45 WoA\n",
      "brockhampton 49 61 Artist\n",
      "kid cudis 11 20 Artist\n",
      "something beautifull need to breathe 16 52 Artist_or_WoA\n",
      "overcooked 48 58 WoA\n",
      "wayfaring strangers 29 48 WoA\n",
      "johnny cash 52 63 Artist\n",
      "flylo 11 16 Artist_or_WoA\n",
      "time to move 21 33 WoA\n",
      "carmen 37 43 Artist\n",
      "andrew huang 0 12 Artist\n",
      "300000 note song 13 29 WoA\n",
      "buttercup 11 20 WoA\n",
      "jack stauber 24 36 Artist\n",
      "the nights 23 33 WoA\n",
      "avicii 36 42 Artist\n",
      "hold my liquor 16 30 WoA\n",
      "kanye west 34 44 Artist\n",
      "pheonixs 29 37 Artist\n",
      "love like a sunset 38 56 WoA\n",
      "blank space 17 28 WoA\n",
      "i prevail 32 41 Artist\n",
      "esbe 135 139 Artist\n",
      "fulfill the dream 140 157 WoA\n",
      "funeral 59 66 WoA\n",
      "phoebe bridges 70 84 Artist\n",
      "trapeze swing 85 98 WoA\n",
      "iron and wine 102 115 Artist\n",
      "within the rose 118 133 WoA\n",
      "matthew and the atlas 137 158 Artist\n",
      "soad 19 23 Artist\n",
      "scott pilgrim vs the world 0 26 Artist_or_WoA\n",
      "coal chamber 0 12 WoA\n",
      "el cu cuy 13 22 Artist\n",
      "atb 26 29 WoA\n",
      "allday 38 44 Artist\n",
      "evil woman 24 34 WoA\n",
      "black sabbath 38 51 Artist\n",
      "tyler the creator 22 39 Artist\n",
      "chance the rapper 44 61 Artist\n",
      "elvis 109 114 Artist\n",
      "bob crosby 115 125 Artist\n",
      "i took a pill in ibiza 17 39 WoA\n",
      "seeb 40 44 Artist\n",
      "remix in the middle 45 64 WoA\n",
      "dj snake 65 73 Artist\n",
      "lush life 74 83 WoA\n",
      "zara larsson 84 96 Artist\n",
      "be the one 97 107 WoA\n",
      "dua lipa 110 118 Artist\n",
      "akiko shikatas 17 31 Artist\n",
      "coco 83 87 WoA\n",
      "endless summer 17 31 WoA\n",
      "trust 37 42 Artist\n",
      "guardians of the galaxy 18 41 WoA\n",
      "ten feet 15 23 WoA\n",
      "curio 27 32 Artist\n",
      "spottieottiedopaliscious 11 35 WoA\n",
      "outcast 39 46 Artist\n",
      "gnaw 11 15 WoA\n",
      "alex g 19 25 Artist\n",
      "worakls 11 18 Artist_or_WoA\n",
      "hermeto pascoal 5 20 Artist\n",
      "the piper at the gates of dawn 0 30 WoA\n",
      "pink floyd 31 41 Artist\n",
      "lindécis 0 8 Artist\n",
      "post malone 26 37 Artist\n",
      "psycho 38 44 WoA\n",
      "milky chance running 15 35 WoA\n",
      "little lion man 11 26 WoA\n",
      "mumford & sons 30 44 Artist\n",
      "6 underground 11 24 WoA\n",
      "sneaker pimps 28 41 Artist\n",
      "bishop gunn 33 44 Artist\n",
      "sopor aeternus 17 31 Artist_or_WoA\n",
      "cherokee numb 0 13 Artist_or_WoA\n",
      "westworld 40 49 WoA\n",
      "the sunslammer cement city 35 61 WoA\n",
      "homestuck 71 80 Artist\n",
      "good news 11 20 WoA\n",
      "mac miller 24 34 Artist\n",
      "just my imagination 17 36 WoA\n",
      "the temptations 40 55 Artist\n",
      "m2us 15 19 Artist\n",
      "magnolia 20 28 WoA\n",
      "either the chain 29 45 Artist_or_WoA\n",
      "mr blue sky 49 60 Artist_or_WoA\n",
      "oxygen by bones 29 44 Artist_or_WoA\n",
      "perry comos 19 30 Artist\n",
      "anema e core 31 43 WoA\n",
      "39 drift away 20 33 WoA\n",
      "hotel california 34 50 Artist\n",
      "waves 29 34 WoA\n",
      "the bahamas 38 49 Artist\n",
      "daydreaming from a moon 5 28 WoA\n",
      "shaped pool 29 40 WoA\n",
      "radiohead 44 53 Artist\n",
      "lady leshurr 57 69 Artist\n",
      "sa roc 73 79 Artist\n",
      "yesterday 45 54 WoA\n",
      "the beatles 58 69 Artist\n",
      "the alternative stranger 23 47 WoA\n",
      "skrillex 53 61 Artist\n",
      "slip away 11 20 WoA\n",
      "mad season 24 34 Artist\n",
      "in this moment 11 25 WoA\n",
      "pretty reckless 29 44 WoA\n",
      "shes gone away 29 43 WoA\n",
      "nin 47 50 Artist\n",
      "childish gambinos 61 78 Artist\n",
      "bonfire 79 86 WoA\n",
      "blacks 74 80 Artist\n",
      "wonderful life 81 95 WoA\n",
      "lou reeds 96 105 Artist\n",
      "perfect day 106 117 WoA\n",
      "coldplays 122 131 Artist\n",
      "dont panic 132 142 WoA\n",
      "nursery bbno$ 58 71 WoA\n",
      "say it aint so 11 25 WoA\n",
      "weezer 29 35 Artist\n",
      "yoe mase 23 31 Artist\n",
      "ptsd 32 36 WoA\n",
      "morrissey and the smiths 11 35 Artist_or_WoA\n",
      "the first minute 16 32 WoA\n",
      "bad guy 36 43 WoA\n",
      "billie eilish 47 60 Artist\n",
      "the main squeeze tauk 11 32 Artist\n",
      "roderick porter 15 30 Artist\n",
      "baby 31 35 WoA\n",
      "morphine robocobra quartett 17 44 Artist_or_WoA\n",
      "white pony 27 37 WoA\n",
      "deftones 41 49 Artist\n",
      "devil and god are raging inside 53 84 WoA\n",
      "brand new 91 100 Artist\n",
      "virtual self particle ants 28 54 Artist_or_WoA\n",
      "photek 31 37 Artist_or_WoA\n",
      "brothers brights 21 37 Artist\n",
      "awake o sleeper 38 53 WoA\n",
      "of monsters and men 21 40 Artist_or_WoA\n",
      "lost boys 34 43 Artist\n",
      "banger 44 50 WoA\n",
      "no church in the wild 20 41 Artist_or_WoA\n",
      "i love you baby 44 59 WoA\n",
      "core 11 15 WoA\n",
      "rl grime 19 27 Artist\n",
      "16 17 19 WoA\n",
      "highly suspect 23 37 Artist\n",
      "devil devil 11 22 WoA\n",
      "milck 26 31 Artist\n",
      "prom queen 35 45 WoA\n",
      "molly kate kestner 49 67 Artist\n",
      "capital cities 17 31 Artist_or_WoA\n",
      "help im alive 57 70 WoA\n",
      "metric 74 80 Artist\n",
      "you and me and jack 22 41 WoA\n",
      "willing to give gold 42 62 WoA\n",
      "mark engel 100 110 Artist\n",
      "khaen whims 17 28 Artist_or_WoA\n",
      "mindless self indulgence 11 35 Artist\n",
      "rhcp 11 15 Artist\n",
      "give it away 16 28 WoA\n",
      "january 17 24 WoA\n",
      "verzache 28 36 Artist\n",
      "taylor swift 40 52 Artist\n",
      "reputation 53 63 WoA\n",
      "dua lipa 89 97 Artist\n",
      "woodstock 17 26 WoA\n",
      "jon bellion 30 41 Artist\n",
      "the less i know the better 45 71 WoA\n",
      "tame impala 75 86 Artist\n",
      "fka twigs 33 42 Artist\n",
      "drive by 18 26 WoA\n",
      "lil peep 30 38 Artist\n",
      "willie colon 10 22 Artist\n",
      "la murga 23 31 WoA\n",
      "kiwi 34 38 WoA\n",
      "harry styles 42 54 Artist\n",
      "together 16 24 WoA\n",
      "martin garrix 25 38 Artist\n",
      "let me feel 43 54 WoA\n",
      "nicky romero&vicetone 55 76 Artist\n",
      "dont stay 17 26 WoA\n",
      "x ambassadors 30 43 Artist\n",
      "elephant 34 42 WoA\n",
      "sebastian wibe 43 57 Artist\n",
      "troy baker 17 27 Artist\n",
      "the smiths 58 68 Artist\n",
      "the hives 15 24 Artist\n",
      "felix cartal 11 23 Artist\n",
      "lights love me 28 42 Artist_or_WoA\n",
      "danse macabre 11 24 WoA\n",
      "the oh hellos 28 41 Artist\n",
      "trudy and the romance 17 38 Artist\n",
      "futures 25 32 Artist\n",
      "mask off 33 41 WoA\n",
      "let em talk 43 54 WoA\n",
      "the mini ladds 11 25 Artist_or_WoA\n",
      "a place only we know 18 38 WoA\n",
      "kashiwa daisuke 34 49 Artist_or_WoA\n",
      "chris stapleton 11 26 Artist\n",
      "first aid kit 30 43 Artist_or_WoA\n",
      "muse 11 15 Artist\n",
      "plug in baby 16 28 WoA\n",
      "snowy white midnight blues 33 59 Artist_or_WoA\n",
      "keytalk and gogovanillas 48 72 Artist\n",
      "lana del rey 34 46 Artist\n",
      "the weeknd 47 57 Artist\n",
      "joji and hozier 58 73 Artist\n",
      "nevermind 23 32 WoA\n",
      "leonard cohen 36 49 Artist\n",
      "lewis del mar 4 17 Artist\n",
      "paris 41 46 WoA\n",
      "the chainsmokers 50 66 Artist\n",
      "seventh son 16 27 Artist_or_WoA\n",
      "operation mind crime 32 52 Artist_or_WoA\n",
      "raven link change 27 44 WoA\n",
      "caver 52 57 Artist\n",
      "i see you 17 26 WoA\n",
      "kygo 30 34 Artist\n",
      "billy raffoul 38 51 Artist\n",
      "sufjan 20 26 Artist\n",
      "radiohead 39 48 Artist\n",
      "talking heads 49 62 WoA\n",
      "car seat headrest 63 80 Artist_or_WoA\n",
      "the strokes 85 96 Artist\n",
      "dragons 27 34 Artist\n",
      "dogma into free dangan 35 57 WoA\n",
      "gungor 14 20 Artist\n",
      "i am mountain 21 34 WoA\n",
      "starlight brigade 44 61 WoA\n",
      "rise or live to win 65 84 WoA\n",
      "all dead all dead 0 17 WoA\n",
      "queen 18 23 Artist\n",
      "dulce 12 17 WoA\n",
      "flofilz 21 28 Artist\n",
      "amour plastique 11 26 WoA\n",
      "zhu&nero 62 70 Artist\n",
      "dreams 71 77 WoA\n",
      "goodbye love 23 35 WoA\n",
      "car seat headrest 39 56 Artist\n",
      "snowing 32 39 Artist\n",
      "just friends 43 55 Artist\n",
      "act your age 16 28 WoA\n",
      "bliss n eso 32 43 Artist\n",
      "glisten interlude 10 27 WoA\n",
      "jeremy zucker 28 41 Artist\n",
      "rev theorys 24 35 Artist\n",
      "hell yeah 36 45 WoA\n",
      "great for mdma 20 34 WoA\n",
      "j dillas 12 20 Artist_or_WoA\n",
      "donuts 21 27 Artist_or_WoA\n",
      "glass animals 17 30 Artist_or_WoA\n",
      "rise against 47 59 WoA\n",
      "dream koala 24 35 Artist_or_WoA\n",
      "deer creek canyon 17 34 WoA\n",
      "sera cahoone 38 50 Artist\n",
      "gunship and the midnight 24 48 Artist_or_WoA\n",
      "fleetwood mac 25 38 Artist\n",
      "the chain 39 48 WoA\n",
      "crosby 51 57 Artist\n",
      "stills 58 64 WoA\n",
      "nash 69 73 Artist\n",
      "helplessly hoping 74 91 WoA\n",
      "better days 15 26 WoA\n",
      "desso 30 35 Artist\n",
      "crywank 11 18 Artist_or_WoA\n",
      "still woozy 19 30 Artist\n",
      "kiss from a rose 11 27 WoA\n",
      "bonnie prince billys 11 31 Artist\n",
      "rubin and cherise 41 58 Artist_or_WoA\n",
      "dont get any closer 32 51 WoA\n",
      "eluvium 55 62 Artist\n",
      "brothers keeper 0 15 WoA\n",
      "anderson paak 19 32 Artist\n",
      "materialistic sugar baby anthems 0 32 Artist_or_WoA\n",
      "buttercup 33 42 WoA\n",
      "jack stauber 43 55 Artist\n",
      "mighty 11 17 Artist_or_WoA\n",
      "coco 45 49 WoA\n",
      "el michels 36 46 Artist\n",
      "affair 47 53 WoA\n",
      "david lynch 47 58 Artist\n",
      "novos baianos 28 41 Artist\n",
      "veridis quo 11 22 WoA\n",
      "daft punk 26 35 Artist\n",
      "george jeff 21 32 Artist_or_WoA\n",
      "jaden smith 33 44 Artist_or_WoA\n",
      "so long see you tomorrow bombay bicycle club 0 44 Artist_or_WoA\n",
      "the format 19 29 Artist\n",
      "haley smalls 0 12 Artist\n",
      "voodoo doll 13 24 WoA\n",
      "college & electric youth 21 45 Artist\n",
      "a real hero 46 57 WoA\n",
      "drive 60 65 WoA\n",
      "data she kid velo 56 73 Artist\n",
      "sebastian 74 83 Artist\n",
      "the magnificent seven 20 41 WoA\n",
      "the clash 45 54 Artist\n",
      "grateful dead 38 51 WoA\n",
      "fischerspooner 28 42 Artist_or_WoA\n",
      "flume 43 48 Artist_or_WoA\n",
      "hermitude 52 61 Artist_or_WoA\n",
      "anthony russo 0 13 Artist\n",
      "yew nork 14 22 WoA\n",
      "grizzly bear will calls 11 34 Artist_or_WoA\n",
      "diplo 37 42 Artist_or_WoA\n",
      "timberlake 40 50 Artist\n",
      "snl 66 69 Artist\n",
      "dick in a box 80 93 WoA\n",
      "dance gavin 20 31 Artist\n",
      "bishops knife trick 29 48 Artist_or_WoA\n",
      "fall out boy 49 61 Artist_or_WoA\n",
      "blue side jhope 66 81 Artist_or_WoA\n",
      "stranger things 32 47 WoA\n",
      "dynoro 37 43 Artist\n",
      "bowsprit 17 25 WoA\n",
      "balmorhea 29 38 Artist\n",
      "générique 69 78 WoA\n",
      "miles davis 82 93 Artist\n",
      "oloff 0 5 Artist\n",
      "sun rays 6 14 WoA\n",
      "colde sunflower colde shh offonoff dance 100 140 Artist_or_WoA\n",
      "hardest geometry problem in the world 0 37 WoA\n",
      "mark mothersbaugh 38 55 Artist\n",
      "hoziers 49 56 Artist\n",
      "someone new 57 68 WoA\n",
      "elle kings 69 79 Artist\n",
      "exes and ohs 80 92 WoA\n"
     ]
    }
   ],
   "source": [
    "# Go through the list of ids and get all the rows associated with each id\n",
    "for id in ids:\n",
    "    entity_rows = music_ner_df.loc[music_ner_df['id'] == id]\n",
    "    text = entity_rows.head(1)[\"text\"].values[0]\n",
    "    doc = small_model(text)\n",
    "    ents = []\n",
    "    for index, row in entity_rows.iterrows():\n",
    "        label = row[\"label\"]\n",
    "        start = row[\"start_offset\"]\n",
    "        end = row[\"end_offset\"]\n",
    "        span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n",
    "        ents.append(span)\n",
    "    doc.ents = ents\n",
    "    if id in train_ids:\n",
    "        train_db.add(doc)\n",
    "    else:\n",
    "        test_db.add(doc)\n",
    "train_db.to_disk('../data/music_ner_train.spacy')\n",
    "test_db.to_disk('../data/music_ner_test.spacy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "e785a427-1cd4-4686-94d2-ec223612672a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[38;5;4mℹ Saving to output directory: ../models/spacy_music_ner\u001b[0m\n",
      "\u001b[38;5;4mℹ Using CPU\u001b[0m\n",
      "\u001b[1m\n",
      "=========================== Initializing pipeline ===========================\u001b[0m\n",
      "\u001b[38;5;2m✔ Initialized pipeline\u001b[0m\n",
      "\u001b[1m\n",
      "============================= Training pipeline =============================\u001b[0m\n",
      "\u001b[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'ner']\u001b[0m\n",
      "\u001b[38;5;4mℹ Initial learn rate: 0.001\u001b[0m\n",
      "E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE \n",
      "---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  ------  ------  ------  ------\n",
      "  0       0          0.00        85.39       265.35     63.00    35.26    22.92    14.84     3.66    0.00    0.00    0.00    0.18\n",
      "  5     200        747.41      2873.85     10642.29   3960.21    77.27    69.25    59.52    55.17   37.07   36.19   38.00    0.60\n",
      " 12     400        700.66       254.46      2183.84    370.70    77.92    72.38    63.15    82.26   44.55   44.12   45.00    0.63\n",
      " 21     600        914.24       163.11      1288.12    407.55    79.24    69.08    61.67    71.21   39.80   40.62   39.00    0.62\n",
      " 32     800       1066.64       128.50      1143.84    421.63    78.09    72.22    64.30    79.03   40.41   41.94   39.00    0.62\n",
      " 45    1000       1464.56       117.67       734.04   1159.56    79.90    69.41    62.32    76.80   43.39   46.07   41.00    0.63\n",
      " 61    1200        941.88        90.95       917.36     60.92    80.40    69.41    62.32    72.31   42.93   45.05   41.00    0.63\n",
      " 80    1400       1385.22        87.17       843.28     98.64    79.90    70.90    62.82    77.17   44.92   48.28   42.00    0.64\n",
      "104    1600        953.55        78.03       586.17    153.99    80.07    71.56    64.63    74.60   45.23   45.45   45.00    0.64\n",
      "133    1800        970.64        65.13       730.98     63.91    80.40    71.06    62.82    72.31   43.75   45.65   42.00    0.64\n",
      "168    2000       4200.54        70.03       697.10   1060.07    80.56    69.91    61.34    74.42   40.61   41.24   40.00    0.62\n",
      "211    2200       4038.66        84.34       721.41   1291.99    80.40    70.57    63.97    76.19   48.08   46.30   50.00    0.65\n",
      "261    2400        372.10        52.12       670.31      1.77    80.89    72.55    65.46    84.55   47.67   49.46   46.00    0.66\n",
      "311    2600        546.39        37.73       670.02     20.32    81.22    71.23    64.14    69.17   45.10   44.23   46.00    0.65\n",
      "361    2800        824.87        32.01       666.86     10.36    81.22    70.73    63.48    81.60   47.18   48.42   46.00    0.65\n",
      "411    3000       1248.57        29.03       571.53     90.03    80.72    68.76    63.15    68.18   48.98   50.00   48.00    0.65\n",
      "461    3200        744.53        24.98       604.77     20.52    81.05    69.08    61.67    80.65   51.28   52.63   50.00    0.66\n",
      "511    3400      10586.64        27.44       607.36   1063.01    79.74    71.89    64.14    70.07   53.06   54.17   52.00    0.67\n",
      "561    3600       2715.90        29.15       736.47     95.58    80.89    73.37    64.96    73.02   47.92   50.00   46.00    0.66\n",
      "611    3800       1497.09        24.71       694.38     24.16    79.74    69.58    62.82    75.00   47.47   47.96   47.00    0.64\n",
      "661    4000       1339.60        22.34       618.29     45.45    81.71    71.23    63.97    79.03   43.06   41.28   45.00    0.64\n",
      "711    4200       1100.50        20.76       619.86     21.44    80.07    71.72    64.30    70.68   47.67   49.46   46.00    0.65\n",
      "761    4400       1548.19        19.02       649.54     27.96    80.23    69.41    63.31    80.95   43.75   45.65   42.00    0.63\n",
      "811    4600        792.71        17.04       533.41     45.45    80.23    71.89    65.46    76.80   47.00   47.00   47.00    0.65\n",
      "861    4800        785.91        16.64       558.07     10.96    80.07    72.38    64.63    75.97   47.37   50.00   45.00    0.65\n",
      "911    5000       2307.66        17.14       556.16     86.73    80.07    70.90    63.64    78.74   43.08   44.21   42.00    0.64\n",
      "\u001b[38;5;2m✔ Saved pipeline to output directory\u001b[0m\n",
      "../models/spacy_music_ner/model-last\n"
     ]
    }
   ],
   "source": [
    "# Train the model\n",
    "train(\"../data/spacy_config_ner.cfg\", output_path=\"../models/spacy_music_ner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "f54c0fcd-14bb-4ad8-80a5-bae2940e6462",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "songs with themes of being unable to settle | ex hoziers someone new elle kings exes and ohs\n",
      "Gold entities:\n",
      "hoziers\n",
      "someone new\n",
      "elle kings\n",
      "exes and ohs\n",
      "Predicted entities: \n",
      "hoziers\n",
      "someone new\n",
      "elle kings\n",
      "exes and\n"
     ]
    }
   ],
   "source": [
    "# Use the trained model for prediction\n",
    "nlp = spacy.load(\"../models/spacy_music_ner/model-last\")\n",
    "first_test_id = test_ids[0]\n",
    "test_rows = music_ner_df.loc[music_ner_df['id'] == first_test_id]\n",
    "input_text = entity_rows.head(1)[\"text\"].values[0]\n",
    "print(input_text)\n",
    "print(\"Gold entities:\")\n",
    "for index, row in entity_rows.iterrows():\n",
    "    label = row[\"label\"]\n",
    "    start = row[\"start_offset\"]\n",
    "    end = row[\"end_offset\"]\n",
    "    span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n",
    "    print(span)\n",
    "doc = nlp(input_text)\n",
    "print(\"Predicted entities: \")\n",
    "for entity in doc.ents:\n",
    "    print(entity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "b48c16ff-be97-41fe-afdc-d07e40ca2355",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'token_acc': 1.0,\n",
       " 'token_p': 1.0,\n",
       " 'token_r': 1.0,\n",
       " 'token_f': 1.0,\n",
       " 'tag_acc': 0.800658978583196,\n",
       " 'sents_p': 0.7352941176470589,\n",
       " 'sents_r': 0.847457627118644,\n",
       " 'sents_f': 0.7874015748031497,\n",
       " 'dep_uas': 0.7089859851607585,\n",
       " 'dep_las': 0.6364385820280297,\n",
       " 'dep_las_per_type': {'root': {'p': 0.6176470588235294,\n",
       "   'r': 0.711864406779661,\n",
       "   'f': 0.6614173228346457},\n",
       "  'prep': {'p': 0.819047619047619, 'r': 0.86, 'f': 0.8390243902439023},\n",
       "  'det': {'p': 0.8372093023255814, 'r': 0.9, 'f': 0.8674698795180723},\n",
       "  'amod': {'p': 0.7678571428571429,\n",
       "   'r': 0.7166666666666667,\n",
       "   'f': 0.7413793103448276},\n",
       "  'pobj': {'p': 0.7333333333333333,\n",
       "   'r': 0.7857142857142857,\n",
       "   'f': 0.7586206896551724},\n",
       "  'nsubj': {'p': 0.5217391304347826,\n",
       "   'r': 0.5454545454545454,\n",
       "   'f': 0.5333333333333332},\n",
       "  'relcl': {'p': 0.3333333333333333,\n",
       "   'r': 0.2222222222222222,\n",
       "   'f': 0.26666666666666666},\n",
       "  'dobj': {'p': 0.75, 'r': 0.5294117647058824, 'f': 0.6206896551724139},\n",
       "  'advmod': {'p': 0.38461538461538464,\n",
       "   'r': 0.21739130434782608,\n",
       "   'f': 0.27777777777777773},\n",
       "  'compound': {'p': 0.5375, 'r': 0.6615384615384615, 'f': 0.593103448275862},\n",
       "  'nmod': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'npadvmod': {'p': 0.3333333333333333,\n",
       "   'r': 0.16666666666666666,\n",
       "   'f': 0.2222222222222222},\n",
       "  'cc': {'p': 0.6666666666666666,\n",
       "   'r': 0.6666666666666666,\n",
       "   'f': 0.6666666666666666},\n",
       "  'appos': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'dep': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'dative': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'conj': {'p': 0.5, 'r': 0.391304347826087, 'f': 0.4390243902439025},\n",
       "  'aux': {'p': 0.625, 'r': 0.625, 'f': 0.625},\n",
       "  'xcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'neg': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'advcl': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'csubj': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'nummod': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'intj': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'csubjpass': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'auxpass': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'agent': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'ccomp': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'acomp': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'mark': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'expl': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'attr': {'p': 0.0, 'r': 0.0, 'f': 0.0},\n",
       "  'acl': {'p': 0.0, 'r': 0.0, 'f': 0.0}},\n",
       " 'ents_p': 0.4421052631578947,\n",
       " 'ents_r': 0.42,\n",
       " 'ents_f': 0.4307692307692308,\n",
       " 'ents_per_type': {'WoA': {'p': 0.4358974358974359,\n",
       "   'r': 0.425,\n",
       "   'f': 0.43037974683544306},\n",
       "  'Artist_or_WoA': {'p': 0.1,\n",
       "   'r': 0.09090909090909091,\n",
       "   'f': 0.09523809523809525},\n",
       "  'Artist': {'p': 0.5217391304347826,\n",
       "   'r': 0.4897959183673469,\n",
       "   'f': 0.5052631578947369}},\n",
       " 'speed': 3835.591242612551}"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Evaluate the model\n",
    "evaluate('../models/spacy_music_ner/model-last', '../data/music_ner_test.spacy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2fb8310-a87c-4713-ae2c-fd178a28030a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
